librarian::shelf(
DT, dplyr, dismo, GGally, here, readr, tidyr)
##
## The 'cran_repo' argument in shelf() was not set, so it will use
## cran_repo = 'https://cran.r-project.org' by default.
##
## To avoid this message, set the 'cran_repo' argument to a CRAN
## mirror URL (see https://cran.r-project.org/mirrors.html) or set
## 'quiet = TRUE'.
select <- dplyr::select # overwrite raster::select
options(readr.show_col_types = F)
dir_data <- here("data/sdm")
pts_env_csv <- file.path(dir_data, "pts_env.csv")
pts_env <- read_csv(pts_env_csv)
nrow(pts_env)
## [1] 20000
datatable(pts_env, rownames = F)
Creating pairs plots using GGally to show correlations between variables. Halina notes: consider dropping ER_topoWet and ER_tri
GGally::ggpairs(
select(pts_env, -ID),
aes(color = factor(present)))
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 16 rows containing missing values
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 16 rows containing missing values
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 16 rows containing missing values
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 59 rows containing missing values
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 18 rows containing missing values
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning in cor(x, y): the standard deviation is zero
## Warning: Removed 16 rows containing missing values (geom_point).
## Warning: Removed 16 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 16 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 16 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 64 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 25 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 16 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 16 rows containing missing values
## Warning: Removed 16 rows containing missing values (geom_point).
## Warning: Removed 16 rows containing missing values (geom_point).
## Warning: Removed 16 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 16 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 64 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 25 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 16 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 16 rows containing missing values
## Warning: Removed 16 rows containing missing values (geom_point).
## Warning: Removed 16 rows containing missing values (geom_point).
## Warning: Removed 16 rows containing missing values (geom_point).
## Warning: Removed 16 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 64 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 25 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 16 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 16 rows containing missing values
## Warning: Removed 59 rows containing missing values (geom_point).
## Warning: Removed 64 rows containing missing values (geom_point).
## Warning: Removed 64 rows containing missing values (geom_point).
## Warning: Removed 64 rows containing missing values (geom_point).
## Warning: Removed 59 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 64 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 59 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 59 rows containing missing values
## Warning: Removed 18 rows containing missing values (geom_point).
## Warning: Removed 25 rows containing missing values (geom_point).
## Warning: Removed 25 rows containing missing values (geom_point).
## Warning: Removed 25 rows containing missing values (geom_point).
## Warning: Removed 64 rows containing missing values (geom_point).
## Warning: Removed 18 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 18 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 18 rows containing missing values
## Warning: Removed 16 rows containing missing values (geom_point).
## Warning: Removed 16 rows containing missing values (geom_point).
## Warning: Removed 16 rows containing missing values (geom_point).
## Warning: Removed 59 rows containing missing values (geom_point).
## Warning: Removed 18 rows containing missing values (geom_point).
## Warning: Removed 16 rows containing missing values (geom_point).
## Warning: Removed 16 rows containing missing values (geom_point).
## Warning: Removed 16 rows containing missing values (geom_point).
## Warning: Removed 59 rows containing missing values (geom_point).
## Warning: Removed 18 rows containing missing values (geom_point).
# setup model data
d <- pts_env %>%
select(-ID) %>% # remove terms we don't want to model
tidyr::drop_na() # drop the rows with NA values
nrow(d)
## [1] 19931
# fit a linear model
mdl <- lm(present ~ ., data = d)
summary(mdl)
##
## Call:
## lm(formula = present ~ ., data = d)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.9845 -0.4210 0.1378 0.4142 1.2550
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.614e-01 7.994e-02 9.526 < 2e-16 ***
## WC_alt -3.899e-05 1.167e-05 -3.340 0.00084 ***
## WC_bio1 3.529e-02 2.022e-03 17.455 < 2e-16 ***
## WC_bio2 -1.918e-02 1.955e-03 -9.810 < 2e-16 ***
## ER_tri -3.639e-03 1.886e-04 -19.293 < 2e-16 ***
## ER_topoWet -8.796e-02 3.889e-03 -22.616 < 2e-16 ***
## lon -1.216e-03 2.850e-04 -4.267 1.99e-05 ***
## lat 1.528e-02 1.463e-03 10.446 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4672 on 19923 degrees of freedom
## Multiple R-squared: 0.1273, Adjusted R-squared: 0.1269
## F-statistic: 415 on 7 and 19923 DF, p-value: < 2.2e-16
Halina notes: predict() is creating a RasterLayer with a prediction based on a model, not 100% sure what the type argument is doing but I know we want the y-predict to be our response or y-values. But basically this is the part where we are running the logit regression so that our range is constrained between 0 and 1. We don’t want predictions outside of this range.
y_predict <- predict(mdl, d, type="response")
y_true <- d$present
range(y_predict)
## [1] -0.2549701 0.9845352
range(y_true)
## [1] 0 1
The problem with these predictions is that it ranges outside the possible values of present 1 and absent 0. (Later we’ll deal with converting values within this range to either 1 or 0 by applying a cutoff value; i.e. any values > 0.5 become 1 and below become 0.)
Halina note: also my adjusted r squared is pretty small?
# fit a generalized linear model with a binomial logit link function
mdl <- glm(present ~ ., family = binomial(link="logit"), data = d)
summary(mdl)
##
## Call:
## glm(formula = present ~ ., family = binomial(link = "logit"),
## data = d)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1521 -1.0310 0.4546 1.0167 2.6916
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.221e+00 3.794e-01 3.217 0.001296 **
## WC_alt -1.636e-04 5.533e-05 -2.957 0.003110 **
## WC_bio1 1.636e-01 9.733e-03 16.810 < 2e-16 ***
## WC_bio2 -8.643e-02 9.373e-03 -9.221 < 2e-16 ***
## ER_tri -1.793e-02 9.664e-04 -18.558 < 2e-16 ***
## ER_topoWet -4.147e-01 1.881e-02 -22.046 < 2e-16 ***
## lon -5.046e-03 1.310e-03 -3.853 0.000117 ***
## lat 7.361e-02 7.011e-03 10.499 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 27630 on 19930 degrees of freedom
## Residual deviance: 24926 on 19923 degrees of freedom
## AIC: 24942
##
## Number of Fisher Scoring iterations: 4
y_predict <- predict(mdl, d, type="response")
range(y_predict)
## [1] 0.02648005 0.90191499
Excellent, our response is now constrained between 0 and 1. Next, let’s look at the term plots to see the relationship between predictor and response.
# show term plots
termplot(mdl, partial.resid = TRUE, se = TRUE, main = F)
librarian::shelf(mgcv)
##
## The 'cran_repo' argument in shelf() was not set, so it will use
## cran_repo = 'https://cran.r-project.org' by default.
##
## To avoid this message, set the 'cran_repo' argument to a CRAN
## mirror URL (see https://cran.r-project.org/mirrors.html) or set
## 'quiet = TRUE'.
# fit a generalized additive model with smooth predictors
mdl <- mgcv::gam(
formula = present ~ s(WC_alt) + s(WC_bio1) +
s(WC_bio2) + s(ER_tri) + s(ER_topoWet) + s(lon) + s(lat),
family = binomial, data = d)
summary(mdl)
##
## Family: binomial
## Link function: logit
##
## Formula:
## present ~ s(WC_alt) + s(WC_bio1) + s(WC_bio2) + s(ER_tri) + s(ER_topoWet) +
## s(lon) + s(lat)
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.07374 0.02897 -2.546 0.0109 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(WC_alt) 8.826 8.988 322.85 <2e-16 ***
## s(WC_bio1) 6.795 7.595 445.75 <2e-16 ***
## s(WC_bio2) 8.253 8.640 818.13 <2e-16 ***
## s(ER_tri) 8.690 8.921 73.91 <2e-16 ***
## s(ER_topoWet) 8.542 8.920 61.45 <2e-16 ***
## s(lon) 8.769 8.982 1025.23 <2e-16 ***
## s(lat) 8.901 8.994 361.79 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.405 Deviance explained = 34.9%
## UBRE = -0.092119 Scale est. = 1 n = 19931
Halina note: well my adjusted r squared definitely increased!
# show term plots
plot(mdl, scale=0)
# load extra packages
librarian::shelf(
maptools, sf)
##
## The 'cran_repo' argument in shelf() was not set, so it will use
## cran_repo = 'https://cran.r-project.org' by default.
##
## To avoid this message, set the 'cran_repo' argument to a CRAN
## mirror URL (see https://cran.r-project.org/mirrors.html) or set
## 'quiet = TRUE'.
# show version of maxent
maxent()
## Loading required namespace: rJava
## This is MaxEnt version 3.4.3
# get environmental rasters
# NOTE: the first part of Lab 1. SDM - Explore got updated to write this clipped environmental raster stack
env_stack_grd <- file.path(dir_data, "env_stack.grd")
env_stack <- stack(env_stack_grd)
plot(env_stack, nc=2)
# get presence-only observation points (maxent extracts raster values for you)
obs_geo <- file.path(dir_data, "obs.geojson")
obs_sp <- read_sf(obs_geo) %>%
sf::as_Spatial() # maxent prefers sp::SpatialPoints over newer sf::sf class
# fit a maximum entropy model
mdl <- maxent(env_stack, obs_sp)
## Warning in .local(x, p, ...): 22 (0.55%) of the presence points have NA
## predictor values
## This is MaxEnt version 3.4.3
Halina notes: I’m concerned about this warning - I thought I got rid of the NA’s?
# plot variable contributions per predictor
plot(mdl)
Halina Notes: interesting that ER_tri and ER_topoWet contribute the least to each predictor because they are also variables I would consider removing since their correlation number was greater than 0.7.
# plot term plots
response(mdl)
# predict
y_predict <- predict(env_stack, mdl) #, ext=ext, progress='')
plot(y_predict, main='Maxent, raw prediction')
data(wrld_simpl, package="maptools")
plot(wrld_simpl, add=TRUE, border='dark grey')